In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
#import colorlover as cl
from IPython.display import HTML, display
from chorogrid import Colorbin, Chorogrid
In [2]:
# Global seaborn look for every figure below: "poster" scaling context and
# the "ticks" axes style.
sns.set_context("poster")
sns.set_style("ticks")
In [3]:
# Raw topic identifier (the part of `topic_name` before the first "/")
# mapped to the human-readable label used in plot titles and tables.
TOPIC_MAPPING = dict(
    GunControl="Gun Control",
    Privacy="Privacy",
    Vaccine="Vaccine",
    ChildEducation="Child Education",
    SkinDamage="Skin Damage",
    SeatBelt="Seat Belt",
)

# Order in which topics are iterated and shown.
topic_order = [
    "Gun Control",
    "Privacy",
    "Vaccine",
    "Child Education",
    "Skin Damage",
    "Seat Belt",
]


def _display_topic(raw_name):
    """Return the display label for a raw `topic_name` value.

    Only the text before the first "/" is looked up; an identifier missing
    from TOPIC_MAPPING raises KeyError.
    """
    return TOPIC_MAPPING[raw_name.split('/')[0]]


# Load the analysis frame and replace `topic_name` with its display label.
# (The original chained a `rename(columns={})` whose only entry was commented
# out, i.e. a no-op; it is omitted here.)
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").assign(
    topic_name=lambda frame: frame.topic_name.apply(_display_topic),
)
# `u_state` values that the state-level plots exclude.
# NOTE(review): "UNK"/"USA" look like "unknown" and "country-level only"
# location markers, the rest are DC and territories -- confirm against the
# data preparation code.
NON_STATES = {"UNK", "USA", "AS", "DC", "GU", "MP", "PR", "VI"}

# One "<population> <postal code>" record per line. DC is listed here even
# though it is also in NON_STATES, so it contributes to any total computed
# from this table.
_STATE_POPULATIONS_RAW = """4863300.00 AL
741894.00 AK
6931071.00 AZ
2988248.00 AR
39250017.00 CA
5540545.00 CO
3576452.00 CT
952065.00 DE
681170.00 DC
20612439.00 FL
10310371.00 GA
1428557.00 HI
1683140.00 ID
12801539.00 IL
6633053.00 IN
3134693.00 IA
2907289.00 KS
4436974.00 KY
4681666.00 LA
1331479.00 ME
6016447.00 MD
6811779.00 MA
9928300.00 MI
5519952.00 MN
2988726.00 MS
6093000.00 MO
1042520.00 MT
1907116.00 NE
2940058.00 NV
1334795.00 NH
8944469.00 NJ
2081015.00 NM
19745289.00 NY
10146788.00 NC
757952.00 ND
11614373.00 OH
3923561.00 OK
4093465.00 OR
12784227.00 PA
1056426.00 RI
4961119.00 SC
865454.00 SD
6651194.00 TN
27862596.00 TX
3051217.00 UT
624594.00 VT
8411808.00 VA
7288000.00 WA
1831102.00 WV
5778708.00 WI
585501.00 WY
"""

# Postal code -> population (float). `str.split()` with no argument splits on
# any run of whitespace, so the table parses whether its two columns are
# separated by a tab or by spaces; blank lines are skipped.
STATE_POPULATIONS = {
    code: float(population)
    for population, code in (
        line.split()
        for line in _STATE_POPULATIONS_RAW.splitlines()
        if line.strip()
    )
}

# Grid-layout CSV handed to Chorogrid.
# NOTE(review): hard-coded absolute path -- this only resolves on a machine
# with the same directory layout; consider making it configurable.
CHOROGRID_STATES_FILE = '/content/Code/smishra8/chorogrid/chorogrid/databases/usa_states.csv'
In [4]:
# Spot-check the parsed population table with a single lookup ("AZ").
STATE_POPULATIONS["AZ"]
Out[4]:
In [5]:
# List the columns available in the loaded frame.
df.columns
Out[5]:
In [6]:
# Summary statistics of the number of distinct entries in each row's CATS
# value. Rows with a missing CATS are filled with 0 and counted as one
# placeholder category.
df.CATS.fillna(0).apply(
    lambda cats: len(Counter(['UNK']) if cats == 0 else Counter(cats))
).describe()
Out[6]:
In [7]:
# Add a per-row Counter over the CATS entries; rows with a missing CATS get
# the single placeholder key 'NONE'. This mutates `df` in place and is relied
# on by the plotting cells below.
df["CATS_Counter"] = df.CATS.fillna(0).apply(
    lambda cats: Counter(['NONE']) if cats == 0 else Counter(cats)
)

# Peek at rows whose Counter holds exactly two distinct keys.
has_two_categories = df.CATS_Counter.apply(len) == 2
df.loc[has_two_categories, "CATS_Counter"].head()
Out[7]:
In [8]:
# Per-location mean / row count / standard deviation of `is_controversial`.
# Only the "USA" location value is dropped here; other NON_STATES values are
# kept.
df_t = (
    df[df.u_state != "USA"]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)
df_t
Out[8]:
In [9]:
# Six-colour palette (hex codes, orange tones through purple tones) used as
# the input colours for the bins.
mycolors = ['#b35806', '#f1a340', '#fee0b6', '#d8daeb', '#998ec3', '#542788']
# Bin the per-location 'mean' column into the palette.
# NOTE(review): `proportional=True` presumably spaces the bins by value rather
# than by rank -- confirm against the chorogrid Colorbin documentation.
mybin = Colorbin(df_t['mean'], mycolors, proportional=True, decimals=None)
# Set the number of decimals, then recalculate so the change takes effect.
mybin.set_decimals(1)
mybin.recalc(fenceposts=True)
# NOTE(review): presumably derives a light/dark label colour per entry so text
# stays readable on its fill colour -- confirm the argument semantics.
mybin.calc_complements(0.5, '#e0e0e0', '#101010')
In [10]:
# Pull the pieces needed by Chorogrid out of the colour bin.
states = list(df_t.u_state)
colors_by_state = mybin.colors_out
font_colors_by_state = mybin.complements
legend_colors = mybin.colors_in
legend_labels = mybin.labels

# Sanity check: print the length and first three entries of each list.
# The objects are paired with their labels explicitly instead of being looked
# up with eval() on the variable name.
for label, obj in [
    ('states', states),
    ('colors_by_state', colors_by_state),
    ('font_colors_by_state', font_colors_by_state),
    ('legend_colors', legend_colors),
    ('legend_labels', legend_labels),
]:
    print("{:>20}: len {:2}: {}...".format(label, len(obj), obj[:3]))
In [11]:
# Draw a first map from the lists built in the previous cell. Note that
# `font_colors_by_state` is not passed to the draw call here.
cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
cg.set_title('mean', font_dict={'font-size': 19})
cg.set_legend(legend_colors, legend_labels, title='mean')
cg.draw_multihex(spacing_dict={'margin_right': 150}) # otherwise legend will be cut off
# another strategy would be to pass a legend_offset to spacing_dict
cg.done(show=True)
In [12]:
def logit_transform(p, eps=1e-8):
    """Return the smoothed log-odds ``log((p + eps) / (1 - p + eps))``.

    Parameters
    ----------
    p : float, numpy array or pandas Series
        Value(s) to transform; arithmetic is applied element-wise, so the
        container type of the input is preserved.
    eps : float, optional
        Small constant added to numerator and denominator so that ``p == 0``
        and ``p == 1`` give finite results. Defaults to ``1e-8``.

    Returns
    -------
    Same kind of object as ``p``, holding the transformed value(s).
    """
    return np.log((p + eps) / (1 - p + eps))
In [13]:
def plot_map(df, location_col, value_col, text_cols,
             scl="Portland", title="", cbar_title="", decimals=2, value_transform=None):
    """Draw a Chorogrid multihex map of one value per location.

    Parameters
    ----------
    df : DataFrame
        One row per location.
    location_col : str
        Column holding the location ids passed to Chorogrid.
    value_col : str
        Column holding the value to colour by (cast to float).
    text_cols, scl
        Accepted for call compatibility; not used by this function.
    title : str
        Map title.
    cbar_title : str
        Legend title.
    decimals : int
        Passed to ``Colorbin.set_decimals``.
    value_transform : callable or truthy flag, optional
        If callable, it is applied to the values before binning. For backward
        compatibility any other truthy value applies ``logit_transform``.
        Previously a callable was only tested for truthiness and
        ``logit_transform`` was applied regardless of what was passed.

    The map is rendered via ``cg.done(show=True)``; nothing is returned.
    """
    mycolors = ['#ffffd9', '#edf8b1', '#c7e9b4', '#7fcdbb', '#41b6c4',
                '#1d91c0', '#225ea8', '#253494', '#081d58']

    values = df[value_col].astype(float)
    if callable(value_transform):
        values = value_transform(values)
    elif value_transform:
        values = logit_transform(values)

    # Bin the values into the palette; decimals are set after construction
    # and the bins are recalculated before the colours are read back.
    mybin = Colorbin(values, mycolors, decimals=None)
    mybin.set_decimals(decimals)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')

    states = list(df[location_col])
    colors_by_state = mybin.colors_out
    font_colors_by_state = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    cg = Chorogrid(
        CHOROGRID_STATES_FILE,
        states, colors_by_state,
    )
    cg.set_title(title, font_dict={'font-size': 19})
    cg.set_legend(legend_colors, legend_labels, title=cbar_title,
                  font_dict={'font-size': '10px', })
    # margin_right leaves room for the legend, which is otherwise cut off.
    cg.draw_multihex(spacing_dict={
        'margin_right': 150,
        'missing_color': '#ffffff',
        'stroke_color': '#000000',
        'stroke_width': 0.1
    }, font_dict={
        'stroke-width': '0.1px',
    }, font_colors=font_colors_by_state)
    cg.done(show=True)
In [14]:
# Map of the mean of `is_controversial` per location (only the "USA" location
# value is excluded before grouping).
not_national = df.u_state != "USA"
df_t = (
    df[not_national]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)
plot_map(
    df_t,
    "u_state", "mean", ["u_state", "len", "std"],
    scl='Portland',
    title="Proportion of controversial tweets per state",
    cbar_title="Proportion",
)
In [15]:
# Map of the mean per-row 'fakenews' count from CATS_Counter, grouped by
# location. No location values are filtered out in this cell.
df_t = (
    df.assign(
        fakenews=df.CATS_Counter.apply(lambda counter: counter.get('fakenews', 0))
    )[["u_state", "fakenews"]]
    .groupby("u_state")["fakenews"]
    .agg([np.mean, len, np.std])
    .reset_index()
)
plot_map(
    df_t,
    "u_state", "mean", ["u_state", "len", "std"],
    scl='Portland',
    title="Proportion of fakenews urls per state",
    cbar_title="Proportion",
)
In [16]:
# One map per URL category, restricted to rows with at least one URL.
for url_type in ["fakenews", "news", "blog"]:
    df_t = df[(df.u_state != "USA")
              & (df.t_n_urls > 0)].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
    )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.mean, len, np.std]).reset_index()

    # Row count of the 10th-largest location. nlargest() simply returns every
    # row when there are fewer than 10, whereas the previous
    # `sort_values().values[-10]` raised IndexError in that case.
    # The threshold is computed before NON_STATES are removed, so fewer than
    # 10 locations may remain on the map.
    min_count = df_t["len"].nlargest(10).min()

    plot_map(df_t[
        (df_t["len"] >= min_count)
        & (~df_t["u_state"].isin(NON_STATES))
    ],
        "u_state", "mean", ["u_state", "len", "std"], scl='Portland',
        title="Proportion of %s urls (in tweets with URLs) per state" % url_type.title(),
        cbar_title="Proportion"
    )
In [17]:
# Number of rows per topic (display labels).
df.topic_name.value_counts()
Out[17]:
In [18]:
def plot_by_topic(df, url_type, nstates=10):
    """Draw one map per topic for the given URL category.

    For every topic in ``topic_order`` the rows with at least one URL (and a
    location other than "USA") are grouped by ``u_state``. The mapped value is
    the per-location sum of the ``url_type`` count scaled by
    ``total_population / location_population``; locations missing from
    STATE_POPULATIONS use the total population, i.e. a scale factor of 1.
    NON_STATES locations are dropped before plotting.

    ``nstates`` is accepted but not used in the body.
    """
    display(HTML("<h2>{}</h2>".format(url_type.upper())))
    total_population = sum(STATE_POPULATIONS.values())

    for topic in topic_order:
        selected = (
            (df.u_state != "USA")
            & (df.t_n_urls > 0)
            & (df.topic_name == topic)
        )
        with_counts = df[selected].assign(**{
            url_type: lambda frame: frame.CATS_Counter.apply(
                lambda counter: counter.get(url_type, 0))
        })
        df_t = (
            with_counts[["u_state", url_type]]
            .groupby("u_state")[url_type]
            .agg([np.sum, np.mean, len, np.std])
            .reset_index()
        )

        # Rank is taken on the plain mean, before "mean" is overwritten below.
        df_t["value_rank"] = df_t["mean"].rank(ascending=False)

        location_population = df_t["u_state"].apply(
            lambda code: STATE_POPULATIONS.get(code, total_population))
        df_t = df_t.assign(
            mean=df_t["sum"] * total_population / location_population)

        plot_map(
            df_t[~df_t["u_state"].isin(NON_STATES)],
            "u_state", "mean",
            ["u_state", "value_rank", "mean", "len", "std"],
            scl="Portland",
            title=topic,
            cbar_title="Proportion",
            decimals=3,
        )


# Notebook-level value forwarded as `nstates=` by the calls in later cells.
nstates = None
In [19]:
# Per-topic maps for the 'fakenews' URL category; `nstates` is the
# notebook-level variable assigned right after plot_by_topic's definition.
url_type = "fakenews"
plot_by_topic(df, url_type, nstates=nstates)
In [20]:
# Per-topic maps for the 'blog' URL category.
url_type = "blog"
plot_by_topic(df, url_type, nstates=nstates)
In [21]:
# Per-topic maps for the 'news' URL category.
url_type = "news"
plot_by_topic(df, url_type, nstates=nstates)
In [22]:
def plot_map_subplots(df, url_type, decimals=2, nstates=10):
    """Draw one map per topic, all sharing a single colour scale.

    For every topic in ``topic_order`` the rows with at least one URL (and a
    location other than "USA") are grouped by ``u_state``. The mapped value is
    the per-location sum of the ``url_type`` count divided by the location's
    share of the total population; locations missing from STATE_POPULATIONS
    use a share of 1. NON_STATES locations are dropped.

    The values of all topics are binned together in one Colorbin so that the
    maps are directly comparable, then each topic is drawn separately.

    Parameters
    ----------
    df : DataFrame
        Needs ``u_state``, ``t_n_urls``, ``topic_name`` and ``CATS_Counter``.
    url_type : str
        Key looked up in each row's ``CATS_Counter``.
    decimals : int
        Passed to ``Colorbin.set_decimals``. This argument used to be ignored
        in favour of a hard-coded 3.
    nstates
        Accepted for call compatibility; not used by this function.
    """
    display(HTML("<h2>{}</h2>".format(url_type.upper())))
    total_population = sum(STATE_POPULATIONS.values())

    # (topic, values, location ids) per topic, in topic_order.
    values_states = []
    for topic in topic_order:
        df_t = df[(df.u_state != "USA")
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                  ].assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
        )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.sum, np.mean, len, np.std]).reset_index()
        df_t = df_t.assign(mean=(df_t["sum"]) / df_t["u_state"].apply(
            lambda k: STATE_POPULATIONS.get(k, total_population) / total_population)
        )
        df_t = df_t[~df_t["u_state"].isin(NON_STATES)]
        values_states.append((
            topic, df_t["mean"].astype(float).values.tolist(),
            df_t["u_state"].values.tolist()
        ))

    mycolors = ['#ffffd9', '#edf8b1', '#c7e9b4', '#7fcdbb', '#41b6c4',
                '#1d91c0', '#225ea8', '#253494', '#081d58']
    # Flatten in topic order; the slicing below relies on this order.
    all_values = [value for _, values, _ in values_states for value in values]
    mybin = Colorbin(
        all_values,
        mycolors,
        proportional=True,
        decimals=None
    )
    mybin.set_decimals(decimals)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')
    colors_by_state_all = mybin.colors_out
    font_colors_by_state_all = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    # Walk through the flat colour lists, taking one topic-sized slice at a
    # time (assumes Colorbin returns colours in input order).
    curr_idx = 0
    for topic, _, states in values_states:
        next_idx = curr_idx + len(states)
        colors_by_state = colors_by_state_all[curr_idx:next_idx]
        font_colors_by_state = font_colors_by_state_all[curr_idx:next_idx]
        curr_idx = next_idx

        cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
        cg.set_title(topic, font_dict={'font-size': 19})
        cg.set_legend(legend_colors, legend_labels, title="Proportion",
                      font_dict={'font-size': '10px', })
        # margin_right leaves room for the legend, which is otherwise cut off.
        cg.draw_multihex(spacing_dict={
            'margin_right': 150,
            'missing_color': '#ffffff',
            'stroke_color': '#000000',
            'stroke_width': 0.1
        }, font_dict={
            'stroke-width': '0.1px',
        }, font_colors=font_colors_by_state)
        cg.done(show=True)
In [23]:
# Per-topic maps for the 'fakenews' URL category.
plot_map_subplots(df, url_type="fakenews", decimals=2, nstates=nstates)
In [24]:
# Per-topic maps for the 'blog' URL category.
plot_map_subplots(df, url_type="blog", decimals=2, nstates=nstates)
In [25]:
# Per-topic maps for the 'news' URL category.
plot_map_subplots(df, url_type="news", decimals=2, nstates=nstates)
In [26]:
def plot_map_subplots(df, url_type, decimals=2, nstates=10):
    """Draw one map per topic of the mean ``url_type`` count, on a shared scale.

    This definition replaces any earlier ``plot_map_subplots`` in the
    notebook. It now also accepts ``nstates``, so calls that pass
    ``nstates=...`` no longer raise TypeError once this cell has been run.

    For every topic in ``topic_order`` the rows with at least one URL (and a
    location other than "USA") are grouped by ``u_state`` and the mean
    ``url_type`` count is mapped. Only the ``nstates`` locations with the most
    rows (ties included) are kept, and NON_STATES locations are dropped. The
    row-count threshold is computed before NON_STATES are removed.

    Parameters
    ----------
    df : DataFrame
        Needs ``u_state``, ``t_n_urls``, ``topic_name`` and ``CATS_Counter``.
    url_type : str
        Key looked up in each row's ``CATS_Counter``.
    decimals : int
        Passed to ``Colorbin.set_decimals``. This argument used to be ignored
        in favour of a hard-coded 3.
    nstates : int or None
        Number of top locations (by row count) to keep; ``None`` disables the
        row-count filter. The default of 10 matches the previously hard-coded
        value. Topics with fewer than ``nstates`` locations keep all of them
        instead of raising IndexError.
    """
    display(HTML("<h2>{}</h2>".format(url_type.upper())))

    # (topic, values, location ids) per topic, in topic_order.
    values_states = []
    for topic in topic_order:
        df_t = df[(df.u_state != "USA")
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                  ].assign(**{
            url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
        )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.mean, len, np.std]).reset_index()

        keep = ~df_t["u_state"].isin(NON_STATES)
        if nstates is not None:
            # Row count of the nstates-th largest location.
            min_count = df_t["len"].nlargest(nstates).min()
            keep = keep & (df_t["len"] >= min_count)
        df_t = df_t[keep]

        values_states.append((
            topic, df_t["mean"].astype(float).values.tolist(),
            df_t["u_state"].values.tolist()
        ))

    mycolors = ['#b35806', '#e08214', '#fdb863', '#fee0b6', '#f7f7f7',
                '#d8daeb', '#b2abd2', '#8073ac', '#542788']
    # Flatten in topic order; the slicing below relies on this order.
    all_values = [value for _, values, _ in values_states for value in values]
    mybin = Colorbin(
        all_values,
        mycolors,
        proportional=True,
        decimals=None
    )
    mybin.set_decimals(decimals)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')
    colors_by_state_all = mybin.colors_out
    font_colors_by_state_all = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    # Walk through the flat colour lists, taking one topic-sized slice at a
    # time (assumes Colorbin returns colours in input order).
    curr_idx = 0
    for topic, _, states in values_states:
        next_idx = curr_idx + len(states)
        colors_by_state = colors_by_state_all[curr_idx:next_idx]
        font_colors_by_state = font_colors_by_state_all[curr_idx:next_idx]
        curr_idx = next_idx

        cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
        cg.set_title(topic, font_dict={'font-size': 19})
        cg.set_legend(legend_colors, legend_labels, title="Proportion",
                      font_dict={'font-size': '10px', })
        # margin_right leaves room for the legend, which is otherwise cut off.
        cg.draw_multihex(spacing_dict={
            'margin_right': 150,
            'missing_color': '#ffffff',
            'stroke_color': '#000000',
            'stroke_width': 0.1
        }, font_dict={
            'stroke-width': '0.1px',
        }, font_colors=font_colors_by_state)
        cg.done(show=True)
In [27]:
# Side-by-side ranking table: for each topic, the locations with the most
# URL-bearing rows, ordered by mean 'fakenews' count and rendered as
# "<location> (<mean>) [<row count>]".
df_topics = {}
for topic in topic_order:
    df_t = df[(df.u_state != "USA")
              & (df.t_n_urls > 0)
              & (df.topic_name == topic)
              ].assign(
        fakenews=lambda x: x.CATS_Counter.apply(lambda k: k.get('fakenews', 0))
    )[["u_state", "fakenews"]].groupby("u_state")["fakenews"].agg([np.mean, len, np.std]).reset_index()
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)

    # Row count of the 10th-largest location. nlargest() returns every row
    # when there are fewer than 10, whereas the previous
    # `sort_values().values[-10]` raised IndexError in that case.
    min_count = df_t["len"].nlargest(10).min()

    ranked = (
        df_t[df_t["len"] >= min_count]
        .sort_values("mean", ascending=False)
        .reset_index()  # positional index so topics line up row by row
    )
    df_topics[topic] = ranked.apply(
        lambda x: "%s (%.2f) [%s]" % (
            x["u_state"], x["mean"], x["len"]), axis=1)
pd.concat(df_topics, axis=1, keys=topic_order)
Out[27]:
In [28]:
# Bar chart of the mean of `is_controversial` per state (NON_STATES removed).
# The figure is created inside the seaborn context managers so that the rc
# overrides (label sizes, monospace font) are active when the axes and their
# text objects are built; previously it was created before entering them.
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 12,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1, 1, figsize=(15, 5))
    g = sns.barplot(y="is_controversial", x="u_state",
                    errwidth=2,
                    data=df[~df.u_state.isin(NON_STATES)].sort_values("u_state"),
                    ax=ax, color="0.7")
    # Reference line at 0.5.
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    sns.despine(offset=10)
In [29]:
# Bar order: the two special values first, then the locations found in the
# data that are not in NON_STATES (sorted), then DC and the territories.
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
    df.u_state.fillna("UNK").value_counts().index
) - NON_STATES) + sorted(["AS", "DC", "GU",
                          "MP", "PR", "VI"]))
# One colour per bar, derived from LOCATION_ORDER instead of assuming that
# exactly 50 states are present in the data.
colors = ["b"] * 2 + ["r"] * (len(LOCATION_ORDER) - 2 - 6) + ["0.7"] * 6

# The figure is created inside the seaborn context managers so that the rc
# overrides are active when the axes and their text objects are built.
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 12,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1, 1, figsize=(16, 5))
    g = sns.barplot(y="is_controversial", x="u_state",
                    errwidth=2,
                    data=df.assign(u_state=df.u_state.fillna("UNK")),
                    ax=ax, color="r", order=LOCATION_ORDER)
    # Reference line at 0.5.
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    # Recolour bar by bar; zip stops at the shorter sequence, so a mismatch
    # between bars and colours cannot raise IndexError.
    for patch, bar_color in zip(ax.patches, colors):
        patch.set_color(bar_color)
    sns.despine(offset=10)
    plt.setp(ax.get_xticklabels()[:3], rotation=90)
In [30]:
# Bar order: the two special values first, then the locations found in the
# data that are not in NON_STATES (sorted), then DC and the territories.
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
    df.u_state.fillna("UNK").value_counts().index
) - NON_STATES) + sorted(["AS", "DC", "GU",
                          "MP", "PR", "VI"]))

# Controversial rows attributed to a state. Rows with a missing `u_state` are
# excluded explicitly: NaN is not a member of NON_STATES, so they used to be
# counted in the denominator and the plotted shares did not sum to 1.
controversial_in_states = df[
    (df.is_controversial == 1)
    & df.u_state.notnull()
    & (~df.u_state.isin(NON_STATES))
]
total_controversial = controversial_in_states.shape[0] * 1.

# The figure is created inside the seaborn context managers so that the rc
# overrides are active when the axes and their text objects are built.
with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 12,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1, 1, figsize=(16, 5))
    # Bar height = share of all state-attributed controversial rows that fall
    # in this state; only the state slice of LOCATION_ORDER is shown.
    g = sns.barplot(y="is_controversial", x="u_state",
                    data=controversial_in_states,
                    ax=ax, color="0.5",
                    order=LOCATION_ORDER[2:-6],
                    ci=None, estimator=lambda x: len(x) / total_controversial)
    ax.set_ylabel("Distribution of controversial tweets\nacross states")
    ax.set_xlabel("US States")
    sns.despine(offset=10)
In [31]:
# Rows per (state, topic), counting `t_id` entries; NON_STATES and missing
# locations are excluded.
in_states = (~df.u_state.isin(NON_STATES)) & (~df.u_state.isnull())
df_t = df[in_states].pivot_table(
    index="u_state", columns="topic_name", values="t_id", aggfunc=len)

# Each column divided by its own total: a state's share of the topic's rows.
topic_share = df_t.divide(df_t.sum(axis=0), axis=1).reset_index()

with sns.plotting_context(
        rc={"axes.titlesize": 10,
            "axes.labelsize": 10,
            "xtick.labelsize": 10,
            "ytick.labelsize": 10,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    # One column of dots per topic, one row per state.
    g = sns.PairGrid(topic_share,
                     x_vars=topic_order, y_vars=["u_state"],
                     size=10, aspect=.25)
    # Draw a dot plot using the stripplot function
    g.map(sns.stripplot, size=10, orient="h",
          color="k", edgecolor="gray")
    # Common axis labels for all columns
    g.set(xlabel="proportion", ylabel="",)
    # Use semantically meaningful titles for the columns
    titles = topic_order
    for ax, title in zip(g.axes.flat, titles):
        # Set a different title for each axes
        ax.set(title=title)
        # Horizontal grid lines only
        ax.xaxis.grid(False)
        ax.yaxis.grid(True)
    sns.despine(left=True, bottom=True)
In [32]:
# Bar order: the two special values first, then the locations found in the
# data that are not in NON_STATES (sorted), then DC and the territories.
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
    df.u_state.fillna("UNK").value_counts().index
) - NON_STATES) + sorted(["AS", "DC", "GU",
                          "MP", "PR", "VI"]))
# One colour per bar, derived from LOCATION_ORDER instead of assuming that
# exactly 50 states are present in the data.
colors = ["b"] * 2 + ["r"] * (len(LOCATION_ORDER) - 2 - 6) + ["0.7"] * 6

with sns.plotting_context(
        rc={"axes.titlesize": 14,
            "axes.labelsize": 14,
            "xtick.labelsize": 12,
            "ytick.labelsize": 14,
            }), sns.axes_style(
        rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1, 1, figsize=(20, 5))
    # The series is passed as `x=` explicitly: newer seaborn releases no
    # longer accept the data vector as the first positional argument.
    ax = sns.countplot(x=df.u_state.fillna("UNK"), color='k', ax=ax,
                       order=LOCATION_ORDER)
    ax.set_yscale('log')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Tweet author location')
    plt.xticks(rotation='vertical')
    # Recolour bar by bar; zip stops at the shorter sequence, so a mismatch
    # between bars and colours cannot raise IndexError.
    for patch, bar_color in zip(ax.patches, colors):
        patch.set_color(bar_color)
In [33]:
# Row counts per location (missing locations shown as "UNK"), split into four
# chunks that are laid out side by side as Location/Counts column pairs.
location_counts = df.u_state.fillna("UNK").value_counts()
pd.concat(
    [
        pd.DataFrame(chunk.reset_index().values, columns=["Location", "Counts"])
        for chunk in np.array_split(location_counts, 4, axis=0)
    ],
    axis=1,
)
Out[33]:
In [34]:
# Summary statistics of the author-location column.
df.u_state.describe()
Out[34]:
In [35]:
# Total number of rows in the location column.
df.u_state.shape
Out[35]:
In [36]:
# One location per `u_id` (the first one returned by the groupby): number of
# ids and summary statistics of their locations.
state_per_user = df.groupby("u_id")["u_state"].first()
state_per_user.shape, state_per_user.describe()
Out[36]:
In [ ]: